# ======= read data Accuracy II, experiment 1 ======= #
# Folco Panizza, 2022/05/20, revised 2022/08/03

# load libraries ===================
library(data.table)          # extention of data.frame
setDTthreads(threads = 0)    # number of threads to be used for data.table parallelised operations
library(here)                # find your project's files, based on the current working directory at the time when the package is loaded
library(jsonlite)            # JSON Parser and Generator for R
library(lubridate)           # functions to work with date-times and time-spans


# custom functions =================




# read Prolific data ===============
# Participant export from Prolific. Every character column is read as a factor
# (a fraction such as stringsAsFactors = 0.10 would instead factorise only the
# character columns containing under 0.10 * nrow unique strings).
prolific <- fread(
  here("Raw data", "experiment 1", "prolific.csv"),
  encoding = "UTF-8",
  stringsAsFactors = TRUE
)

# rename variables and order the factor levels
# previous version of the raw column names, kept for reference:
# oldnames = c("Climate Change", "Country of Birth", "Current Country of Residence","Employment Status","First Language","Highest education level completed","Household Income (GBP)","Household Size","Student Status")
# raw column name = name used in the rest of the script
renames <- c(
  "Climate change"                    = "climate_change",
  "Country of birth"                  = "birth",
  "Country of residence"              = "residence",
  "Employment status"                 = "employment",
  "Language"                          = "language",
  "Highest education level completed" = "education",
  "Household income (gbp)"            = "household_income",
  "Household size"                    = "household_size",
  "Student status"                    = "student",
  "Status"                            = "status",
  "Submission id"                     = "session_id",
  "Started at"                        = "started_datetime",
  "Completed at"                      = "completed_date_time",
  "Time taken"                        = "time_taken",
  "Age"                               = "age",
  "Total approvals"                   = "num_approvals",
  "Total rejections"                  = "num_rejections",
  "Approval rate"                     = "prolific_score",
  "Reviewed at"                       = "reviewed_at_datetime",
  "Completion code"                   = "entered_code",
  "Social-media"                      = "Social-Media"
)
setnames(prolific, old = names(renames), new = unname(renames))

# education: levels ordered as listed below, with short labels
educationLevels <- c(
  "Don't know / not applicable",
  "No formal qualifications",
  "Secondary education (e.g. GED/GCSE)",
  "High school diploma/A-levels",
  "Technical/community college",
  "Undergraduate degree (BA/BSc/other)",
  "Graduate degree (MA/MSc/MPhil/other)",
  "Doctorate degree (PhD/other)"
)
educationLabels <- c("other", "no qualifications", "secondary", "high school", "college", "undergraduate", "graduate", "doctorate")
prolific[, education := factor(education, levels = educationLevels, labels = educationLabels)]

# employment status, with short labels
employmentLevels <- c(
  "Full-Time",
  "Unemployed (and job seeking)",
  "Other",
  "Not in paid work (e.g. homemaker', 'retired or disabled)",
  "Part-Time",
  "Due to start a new job within the next month"
) # ,"DATA EXPIRED"
employmentLabels <- c("full-time", "unemployed", "other", "not in paid work", "part-time", "due to start") # ,"DATA EXPIRED"
prolific[, employment := factor(employment, levels = employmentLevels, labels = employmentLabels)]

# climate change: the repeated label merges "Don't know" and
# "Not applicable / rather not say" into one level; empty answers become "N/A"
climateLevels <- c("No", "Yes", "Don't know", "Not applicable / rather not say", "")
climateLabels <- c("No", "Yes", "Don't know/not say", "Don't know/not say", "N/A")
prolific[, climate_change := factor(climate_change, levels = climateLevels, labels = climateLabels)]

# household income: ordered from lowest to highest band
# NOTE(review): values that are not listed in `levels` silently become NA.
# The list has no band between "£10,000 - £15,999" and "£20,000 - £29,999",
# and the first level ends with a space - confirm both against the raw export.
incomeLevels <- c(
  "Less than £10,000 ", "£10,000 - £15,999", "£20,000 - £29,999", "£30,000 - £39,999",
  "£40,000 - £49,999", "£50,000 - £59,999", "£60,000 - £69,999", "£70,000 - £79,999",
  "£80,000 - £89,999", "£90,000 - £99,999", "£100,000 - £149,999", "More than £150,000",
  "Rather not say"
)
prolific[, household_income := factor(household_income, levels = incomeLevels)]

# remove only the helper objects created in this section, instead of wiping
# every object in the workspace other than `prolific`
rm(renames, educationLevels, educationLabels, employmentLevels, employmentLabels,
   climateLevels, climateLabels, incomeLevels)



# read Qualtrics data ==============
Qualtrics <- fread(
  here("Raw data", "experiment 1", "qualtrics.csv"),
  encoding = "UTF-8"
)

# rename scoring variables: columns whose name matches "SC[0-9]" take the
# value found in the first row as their new name
scores <- names(Qualtrics) %like% "SC[0-9]"
names(Qualtrics)[scores] <- unlist(Qualtrics[1L, scores, with = FALSE])
rm(scores)

# the questions associated with each variable (first row of the export)
questions <- t(Qualtrics[1L])

# the first two rows are not participant data;
# the JSON column is dropped here because it is read another way
Qualtrics <- Qualtrics[-(1:2), -"JS-data"]

# write temporary file
# https://stackoverflow.com/questions/52554665/how-can-i-specify-encode-in-fwrite-for-export-csv-file-r
fwrite(
  x    = Qualtrics,
  file = here("Data", "experiment 1", "questionnaire.csv"),
  bom  = TRUE
)

# reading the file back lets fread detect the column types again:
# numeric variables previously classified as character are now correctly numeric
Qualtrics <- fread(
  here("Data", "experiment 1", "questionnaire.csv"),
  encoding = "UTF-8",
  stringsAsFactors = TRUE
)

# remove empty responses: keep finished questionnaires with a validity answer
# Qualtrics[Finished==FALSE, table(tag)]
# Qualtrics[Finished==TRUE, .N] 1002 # experiment 2
Qualtrics <- Qualtrics[Finished == TRUE]
Qualtrics <- Qualtrics[Validity != ""]

# total reading time of instructions (sum of the three intro pages, per response)
Qualtrics[
  ,
  intro_duration := sum(
    c(`Time_intro1_Page Submit`, `Time_intro2_Page Submit`, `Time_intro3_Page Submit`),
    na.rm = TRUE
  ),
  by = ResponseId
]

# accuracy rating, normalised to [0,1]
Qualtrics[, accuracy := (validity * valid + 5) / 10]

# rescaled accuracy rating, and correct guessing (accuracy rounded to 0/1)
Qualtrics[, `:=`(
  accuracy_score = accuracy * 5 + 1,
  correct        = round(accuracy)
)]

# lateral reading and click restraint (self-report), and response extremity
Qualtrics[, `:=`(
  lateral   = as.factor(Where %like% "Search"),
  restraint = as.factor(ClickRestraint %like% "subsequent"),
  certainty = factor(abs(validity), labels = c("possibly", "probably", "definitely"))
)]

# add prolific information (left join: every Qualtrics response is kept)
# previous version of the join used by.y = "participant_id"
Qualtrics <- merge(
  Qualtrics, prolific,
  by.x  = "PROLIFIC_PID",
  by.y  = "Participant id",
  all.x = TRUE
)
rm(prolific)

# remove any non-paid participants e.g. timed-out
Qualtrics <- Qualtrics[status == "APPROVED"]

# Order main columns (the remaining columns follow in their current order)
setcolorder(
  Qualtrics,
  c("PROLIFIC_PID", "ResponseId", "incentive", "tag", "topic",
    "source", "accuracy", "correct", "lateral", "restraint")
)

# set key variables
setkeyv(Qualtrics, c("PROLIFIC_PID", "ResponseId", "tag", "incentive"))

# Save file (the csv overwrites the temporary file written earlier)
fwrite(
  x    = Qualtrics,
  file = here("Data", "experiment 1", "questionnaire.csv"),
  bom  = TRUE
)
save(Qualtrics, file = here("Data", "experiment 1", "questionnaire.Rdata"))





# read JSON data =============
# read.csv is used instead of fread because fread messes up the encoding of
# double quotes (it doubles double quotes!). read.csv also turns the column
# name "JS-data" into "JS.data".
JSON <- setDT(read.csv(
  here("Raw data", "experiment 1", "qualtrics.csv"),
  encoding = "UTF-8",
  stringsAsFactors = FALSE
))
JSON <- JSON[-1:-2]           # the first two rows are not participant data

JSON <- JSON[Validity != ""]  # remove empty responses

# convert JSON to readable variables, one participant/response at a time
JSON <- JSON[JS.data != "", {
  JSdata <- fromJSON(`JS.data`, flatten = TRUE)

  # some variables are stored as integer for some participants and as double
  # for others: transform everything into double so that groups can be stacked.
  # List-style indexing (JSdata[cols]) is used because JSdata[, cols] collapses
  # to a plain vector when a single column is selected, which would make
  # lapply() iterate over rows instead of columns.
  int_cols <- vapply(JSdata, is.integer, logical(1))
  if (any(int_cols)) {
    JSdata[int_cols] <- lapply(JSdata[int_cols], as.double)
  }

  # some computers add an URL parameter: create an empty placeholder when it is
  # missing, sized to the number of rows rather than a fixed length of 3
  if (!("url.X-OpenDNS-Session" %in% names(JSdata))) {
    JSdata$`url.X-OpenDNS-Session` <- rep("", nrow(JSdata))
  }

  # same column order for every participant
  JSdata <- JSdata[, order(names(JSdata))]
  JSdata
}, by = .(PROLIFIC_PID, ResponseId)]

JSON <- JSON[sender == "Post"]  # keep only JSON data related to the post

# keep only relevant variables
vars <- c("PROLIFIC_PID", "ResponseId", "condition", "Name", "facebook", "website", "wikipedia", "creative", "whois",
          "leave", "comeback", "info_open", "info_close", "blur", "focus", "duration", "time_show", "time_end", "time_commit", "timestamp",
          "meta.screen_width", "meta.screen_height", "meta.userAgent")
JSON <- JSON[, ..vars]
rm(vars)

# more explicit variable names
setnames(JSON,
         old = c("duration"   , "time_show"  , "time_end"       , "time_commit", "timestamp"      , "meta.screen_width", "meta.screen_height", "meta.userAgent"),
         new = c("active_time", "render_time", "completion_time", "record_time", "record_datetime", "screen_width"     , "screen_height"     , "userAgent"))
setnames(JSON, old = "Name", new = "tag")

# convert to POSIX
JSON[, record_datetime := as_datetime(record_datetime)]

# add variables from Qualtrics (only responses present in both tables are kept)
# Qualtrics page timings are multiplied by 1000; blurQ / focusQ are list
# columns obtained by splitting the Qualtrics `blur` / `focus` text on ", "
JSON <- merge(
  JSON,
  Qualtrics[, .(
    accuracy,
    topic,
    source,
    INCENTIVE         = incentive,
    first_click       = `Q_time_First Click` * 1000,
    last_click        = `Q_time_Last Click` * 1000,
    completion_timeQ  = `Q_time_Page Submit` * 1000,
    click_count       = `Q_time_Click Count`,
    blurQ             = lapply(strsplit(as.character(blur), split = ", "), as.character),
    focusQ            = lapply(strsplit(as.character(focus), split = ", "), as.character),
    started_datetimeQ = StartDate,
    started_datetimeP = started_datetime,
    intro_duration,
    ResponseId
  )],
  by = "ResponseId"
)


# quality checks
JSON[, `:=`(
  # user agent mentions a mobile platform
  mobile_user         = userAgent %like% "Android|iPhone|iPad",
  # absolute difference between the two completion times greater than 3 seconds
  completion_mismatch = abs(completion_time - completion_timeQ) > 3000
)]
# ggplot(data=unique(JSON[abs(lag)<10000,abs(lag),ResponseId]), aes(x=V1)) + geom_histogram()

# order events in long format
# Every event column is flattened into one row per recorded event. The text
# before the first "|" of an entry is kept as `timestamp`, the text after the
# last "|" as `time`; entries without "|" end up unchanged in both columns.
events <- c("facebook", "website", "wikipedia", "creative", "whois", "leave", "comeback", "info_open", "info_close", "blur", "focus", "blurQ", "focusQ")
timeline <- JSON[
  ,
  .(
    event     = names(unlist(.SD)),
    timestamp = sapply(unlist(.SD), function(x) gsub('\\|.*$', '', x)),
    time      = sapply(unlist(.SD), function(x) gsub('^.*\\|', '', x))
  ),
  .SDcols = events,
  by = .(ResponseId, lag = completion_timeQ - completion_time)
]
timeline[, event := gsub('[0-9]+', '', event)]  # remove numbers from the name of the event

timeline[, time := as.numeric(time)]            # convert relative time to numeric
timeline[, timestamp := as_datetime(timestamp)] # convert absolute time to POSIX

# absolute timestamp for this.timer == 0, computed per response
timeline[, started_datetime := {
  eventJS <- !(event %like% 'Q')   # JavaScript events (event name does not contain "Q")
  if (any(eventJS)) {
    i <- which(eventJS)[1]         # select the first one (any event should do)
    timestamp[i] - time[i] / 1000  # compute absolute timestamp (time is divided by 1000 to get seconds)
  } else {
    # missing value of the same class and time zone as `timestamp`, so that
    # every group returns the same type (a bare NA is logical)
    timestamp[NA_integer_]
  }
}, by = ResponseId]

# add relative time to Q events.
# The units are fixed to seconds: subtracting two date-times lets R choose the
# units (secs, mins, hours, ...) from the size of the differences, so the
# multiplication by 1000 would not always return milliseconds.
timeline[
  !is.na(started_datetime) & event %like% 'Q',
  time := as.numeric(difftime(timestamp, started_datetime, units = "secs")) * 1000
]

setorder(timeline, ResponseId, timestamp) # order by participant, by timeline

# add other variables such as screen size and duration (ms)
JSON <- timeline[JSON[, -..events], on = "ResponseId"]
rm(events, timeline)

JSON <- unique(JSON) # remove duplicate rows

# Time of post evaluation (accounting for popup reading time)
# JSON[, post_evaluation := active_time]
# JSON[, active_time]

# adjust time for non-Qualtrics events
# JSON[, time_ := fifelse(event%like%'Q', time, time + render_time + lag)]

# sort rows by response, then absolute and relative event time
setorderv(JSON, c("ResponseId", "timestamp", "time"))

# main columns first ("time_" would follow "time" if the adjustment above were enabled)
setcolorder(
  JSON,
  c("PROLIFIC_PID", "ResponseId", "INCENTIVE", "tag", "event",
    "timestamp", "time", "started_datetime", "record_datetime",
    "active_time", "render_time", "completion_time", "completion_timeQ",
    "first_click", "last_click")
)

# Save file
fwrite(
  x    = JSON,
  file = here("Data", "experiment 1", "search-data.csv"),
  bom  = TRUE
)

# rename dataset before saving it as Rdata
JavaScript <- JSON
rm(JSON)
save(JavaScript, file = here("Data", "experiment 1", "search-data.Rdata"))
